import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# --- Load the three raw Book-Crossing files ---------------------------------
# All files are semicolon-separated and Latin-1 encoded; the first row holds
# the original headers, so skip it and supply our own column names.
u_cols = ['user_id', 'location', 'age']
i_cols = ['isbn', 'book_title', 'book_author', 'year_of_publication', 'publisher', 'img_s', 'img_m', 'img_l']
r_cols = ['user_id', 'isbn', 'rating']

_read_opts = dict(sep=';', encoding='latin-1', low_memory=False, skiprows=1)
users = pd.read_csv('../BookReview/Data/BX-Users.csv', names=u_cols, **_read_opts)
items = pd.read_csv('../BookReview/Data/BX-Books.csv', names=i_cols, **_read_opts)
ratings = pd.read_csv('../BookReview/Data/BX-Book-Ratings.csv', names=r_cols, **_read_opts)

# Peek at the first rows of each table.
users.head()
items.head()
ratings.head()
# Join users with their ratings, then attach the book metadata for each isbn.
df = users.merge(ratings, on='user_id').merge(items, on='isbn')
df.head()
# Dimensions of the merged dataset.
df.shape
# Number of distinct users.
df['user_id'].nunique()
# Number of distinct books.
df['isbn'].nunique()
# Summary statistics for the numeric variables.
df[['rating', 'age']].describe()
# The maximum age of 244 looks like a data-entry error; check the maximum
# among plausible ages (< 120) instead.
df.age[df['age'] < 120].max()
# Top ten publication years by number of distinct books published.
year_book = df[['year_of_publication', 'isbn']].drop_duplicates()
(year_book.groupby(['year_of_publication'])['isbn']
          .count()
          .reset_index(name="Number of Books")
          .sort_values(by="Number of Books", ascending=False)
          .head(10))
# Top ten publishers by number of distinct books.
publisher_book = df[['publisher', 'isbn']].drop_duplicates()
(publisher_book.groupby(['publisher'])['isbn']
               .count()
               .reset_index(name="Number of Books")
               .sort_values(by="Number of Books", ascending=False)
               .head(10))
# Count ratings per title and keep the ten most-rated books.
book_ratings = (df.groupby(["book_title"])['user_id']
                  .count()
                  .reset_index(name='number of ratings')
                  .sort_values(by='number of ratings', ascending=False))
Top10_book_ratings = book_ratings.head(10)

# Horizontal bar chart of the ten most-rated titles.
plt.figure(figsize=(8, 6))
sns.barplot(x='number of ratings', y='book_title', data=Top10_book_ratings)
plt.ylabel('Book title')
plt.title('Top 10 books with most ratings', size=20)

# Average number of ratings per book: 3.82
# (total number of ratings divided by total number of distinct books)
x = df.rating.count()
y = df.isbn.nunique()
x / y
# Average rating per title, together with the number of ratings received.
book_ave_ratings = (df.groupby('book_title')
                      .agg(count=('isbn', 'count'), average_rating=('rating', 'mean'))
                      .reset_index())
# Rank by average rating, restricted to titles with at least 50 ratings so a
# handful of perfect scores cannot dominate.
highest_rated_books = (book_ave_ratings[book_ave_ratings['count'] >= 50]
                       .sort_values(by='average_rating', ascending=False)
                       .head(10))

# Bar chart of the ten highest-rated books.
plt.figure(figsize=(8, 6))
sns.barplot(x='average_rating', y='book_title', data=highest_rated_books)
plt.ylabel('Book title')
plt.xlabel('Average rating')
plt.xticks(np.arange(0, 10.5, 0.5))
plt.title('Top 10 books with the highest average rating \n(on a scale of 0 to 10)', size=20)
# Count ratings per author and keep the ten most-rated.
author_ratings = (df.groupby(["book_author"])['user_id']
                    .count()
                    .reset_index(name='number of ratings')
                    .sort_values(by='number of ratings', ascending=False))
top_author_ratings = author_ratings.head(10)

# Bar chart of the ten most-rated authors.
plt.figure(figsize=(8, 6))
sns.barplot(x='number of ratings', y='book_author', data=top_author_ratings)
plt.ylabel('Authors')
plt.xlabel('Number of ratings')
plt.xticks(np.arange(0, 10000, 1000))
plt.title('Top 10 authors with most ratings', size=20)
# Create a word cloud of popular authors (authors weighted by how often they
# appear in the ratings rows, i.e. the authors with most ratings).
!pip install wordcloud
from wordcloud import WordCloud,STOPWORDS
# Standard word-cloud stop-word list (common English filler words).
stop_words=set(STOPWORDS)
# Concatenate every author cell into one blob; astype(str) turns NaN into 'nan'.
author_string = " ".join(df['book_author'].astype(str))
wc = WordCloud(width=600,height=400, max_font_size=100,stopwords=stop_words,background_color='white').generate(author_string)
fig=plt.figure(figsize=(10,8))
plt.axis('off')
plt.title('Wordcloud of Popular Authors',size=20)
plt.imshow(wc, interpolation='bilinear') # interpolation = 'bilinear' to make the displayed image appear more smoothly
# Per-author rating count and mean rating.
author_rating = (df.groupby('book_author')
                   .agg(count=('isbn', 'count'), average_rating=('rating', 'mean'))
                   .reset_index())
# Average number of ratings per author: 10.15
author_rating['count'].mean()
# Rank by mean rating, limited to authors with at least 50 ratings.
top_author_rating = (author_rating[author_rating['count'] >= 50]
                     .sort_values(by='average_rating', ascending=False)
                     .head(10))

# Bar chart of the ten authors with the highest average rating.
plt.figure(figsize=(8, 6))
sns.barplot(x='average_rating', y='book_author', data=top_author_rating)
plt.ylabel('Authors')
plt.xlabel('Average rating')
plt.xticks(np.arange(0, 11, 1))
plt.title('Top 10 authors with the highest average rating', size=20)
# Age distribution of the users; ages above 100 are dropped as implausible.
age_trunc = df[df['age'] <= 100]
# One age value per distinct user so each person counts once.
age_trunc = age_trunc.drop_duplicates(subset=['user_id'])['age']
# sns.distplot(age_trunc, bins=30, kde=True)  # seaborn alternative
plt.hist(age_trunc, 30, density=True, facecolor='lightblue', alpha=1)
plt.xlabel('Age')
plt.ylabel('Probability')
plt.title("Histogram of users' age")
plt.xlim(0, 90)
plt.xticks(np.arange(0, 90, 5))
plt.ylim(0, 0.04)
plt.grid(True)
plt.show()
# Where do the users come from?
# Split 'location' ("city, state, country") into three columns; n=2 caps the
# number of splits so any extra commas stay in the country part.
df[['city', 'state', 'country']] = df["location"].str.split(", ", expand=True, n=2)
# One row per user.
users = df.drop_duplicates(subset=['user_id'])
# Users per country, keeping only countries with more than 500 users.
country = users.groupby('country')['user_id'].count().reset_index(name='count')
country2 = country[country['count'] > 500]

# Pie chart of the users' home countries.
import plotly.express as px
fig = px.pie(country2, values="count", names="country",
             title="Home country of users", template="seaborn")
fig.update_traces(textposition="inside", textinfo="value+percent+label")
fig.show()
# US users broken down by state.
US_users = users[users.country == "usa"]
state = US_users.groupby('state')['user_id'].count().reset_index(name='count')
# Keep only states with more than 1500 users.
state2 = state[state['count'] > 1500]
fig = px.pie(state2, values="count", names="state",
             title="Location (state) of the US users", template="seaborn")
fig.update_traces(textposition="inside", textinfo="value+percent+label")
fig.show()
# US users broken down by city, keeping cities with more than 250 users.
city = US_users.groupby('city')['user_id'].count().reset_index(name='count')
city2 = city[city['count'] > 250]
fig = px.pie(city2, values="count", names="city",
             title="Location(city) of the US users", template="seaborn")
fig.update_traces(textposition="inside", textinfo="value+percent+label")
fig.show()
# The top 10 users with the most ratings.
user_rating = (df.groupby('user_id')['user_id']
                 .count()
                 .reset_index(name='number of ratings')
                 .sort_values(by='number of ratings', ascending=False))
top10_users = user_rating.head(10)
# Bar chart. Plot from the DataFrame rather than the bare Series: the original
# Series.plot.bar(x='User ID', y='Number of ratings') call silently ignored
# its x=/y= arguments, so the x-axis showed positional indices instead of the
# actual user ids. x='user_id' puts the real ids on the axis.
top10_users.plot.bar(x='user_id', y='number of ratings', color='lightblue', legend=False)
# Add title and axis names
plt.title('The top 10 users with the most ratings')
plt.xlabel('User ID')
plt.ylabel('Number of ratings')
plt.show()
# Working copy with only the columns the recommender needs.
mydf = df[['user_id', 'book_title', 'rating']]
# Keep a single rating per (user, title) pair.
mydf = mydf.drop_duplicates(subset=['user_id', 'book_title'])
# 'rating = 0' is treated as a valid rating:
#   382,203 observations with rating > 0
#   644,194 observations with rating = 0
To get an idea about a user’s preference, we need a user who’s rated at least 5 books. Similarly, to derive predictions regarding book recommendations, we need books that have been rated at least 5 times.
# Keep books rated more than 5 times ("quality" books).
# NOTE(review): the surrounding text says "at least 5", but both filters below
# are strictly greater than 5 — confirm which threshold was intended.
title_counts = mydf['book_title'].value_counts()
books = title_counts[title_counts > 5].index.to_list()
quality_rating = mydf[mydf['book_title'].isin(books)]
# Keep users with more than 5 ratings ("quality" users).
user_counts = mydf['user_id'].value_counts()
users = user_counts[user_counts > 5].index.to_list()
quality_rating = quality_rating[quality_rating['user_id'].isin(users)]
quality_rating.head()
# Observations reduced from 1,026,397 to 585,687.
# Confirm there are no missing values.
quality_rating.isnull().sum()
Some users may be tougher than others; that is, they tend to give ratings that are consistently lower than average. So we need to bring all users to the same level by removing this bias. I subtract each user's average rating (over all the books they rated) from each of that user's individual ratings (i.e. demeaning). By doing this, every user's average rating becomes 0, which brings them all to the same level and removes their bias.
# Demean the data: subtract each user's mean rating from every rating that
# user gave, so every user's average becomes 0 and per-user leniency or
# toughness bias is removed.
# Work on an explicit copy first: quality_rating is a filtered slice of mydf,
# and adding a column to a slice triggers pandas' SettingWithCopyWarning and
# can fail to write through to the underlying frame.
quality_rating = quality_rating.copy()
quality_rating['trans_rating'] = quality_rating['rating'] - quality_rating.groupby('user_id')['rating'].transform('mean')
When converting to pivot table, we are working with an extremely sparse matrix. We fill the missing values with 0.
# Books-by-users pivot of the demeaned ratings. The matrix is extremely
# sparse, so missing (book, user) entries are filled with 0.
book_ratings = quality_rating.pivot_table(index='book_title', columns='user_id', values='trans_rating', fill_value=0)
A lot of values in the pivot table are zero. Thus, we’re dealing with extremely sparse data. In such a case, we need to work with a scipy-sparse matrix to avoid overflow and wasted memory.
# Store the mostly-zero matrix as a scipy CSR sparse matrix to avoid
# overflow and wasted memory.
from scipy.sparse import csr_matrix
book_ratings_sparse = csr_matrix(book_ratings)

# k-nearest-neighbours model: brute-force search with cosine distance over
# the sparse book vectors, parallelised across all CPU cores.
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(n_neighbors=10, metric='cosine', algorithm='brute', n_jobs=-1)
model_knn.fit(book_ratings_sparse)
# distances, indices = model_knn.kneighbors(book_ratings_sparse)
# Row labels of the ratings matrix, in matrix order.
book_titles = book_ratings.index.to_list()
# Medium-size cover-image URLs for the quality books.
book_cover = items[items['book_title'].isin(books)]['img_m']
# One row per title that survived the quality filters ("valid" titles).
titles = quality_rating.drop_duplicates(subset='book_title')
# One row per title in the raw book table (keeps the image columns).
all_books = items.drop_duplicates(subset='book_title')
# Right-join so every valid title keeps a row, then sort by title —
# presumably so row positions line up with the sorted pivot-table index
# (book_titles); verify this alignment before relying on iloc lookups.
merged_book = all_books.merge(titles, on='book_title', how='right').sort_values(by='book_title')
from IPython.display import Image
from skimage import io
def make_recommendation(model_knn, data, fav_book, n_recommendations):
    """Print the closest neighbours of fav_book and display their cover images.

    model_knn: a NearestNeighbors instance (re-fitted here on `data`).
    data: sparse books-by-users rating matrix, rows aligned with book_titles.
    fav_book: exact title string to look up in book_titles.
    n_recommendations: how many similar books to report.
    """
    model_knn.fit(data)
    # Row of the query book in the ratings matrix.
    query_index = book_titles.index(fav_book)
    # Ask for one extra neighbour: the closest match is the query book itself.
    distances, indices = model_knn.kneighbors(data[query_index], n_neighbors=n_recommendations + 1)
    neighbours = list(zip(indices.squeeze().tolist(), distances.squeeze().tolist()))
    # Closest first; drop the leading entry (the query book, distance ~0).
    neighbours.sort(key=lambda pair: pair[1])
    raw_recommends = neighbours[1:]
    print('You have input book:', fav_book)
    print('Recommendation system starts to make inference')
    print('......\n')
    print('Recommendations for {}:'.format(fav_book))
    for rank, (idx, dist) in enumerate(raw_recommends, start=1):
        # NOTE(review): merged_book.iloc[idx] assumes merged_book rows are in
        # the same order as book_titles — confirm the alignment holds.
        url = merged_book.iloc[idx]['img_m']
        print('{0}: {1}, with distance of {2}'.format(rank, book_titles[idx], dist))
        io.imshow(io.imread(url))
        io.show()
# Example 1: collaborative-filtering recommendations for a single title.
my_book = 'The Little Prince'
make_recommendation(model_knn, book_ratings_sparse, my_book, 5)
# Example 2: the title must match an entry of book_titles exactly.
my_book = 'Harry Potter and the Goblet of Fire (Book 4)'
make_recommendation(model_knn, book_ratings_sparse, my_book, 5)
We do not have detailed information about the content of the books, such as genre or a content description. The only content information we can use is the books' titles.
# Content-based approach: the only "content" available is the title text, so
# vectorise the titles with TF-IDF (word tokens, English stop words removed).
from sklearn.feature_extraction.text import TfidfVectorizer
titles = merged_book['book_title']
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')
tfidf_matrix = vectorizer.fit_transform(titles)
# Shape: 31,630 books by 19,263 distinct title words (bag of words).
tfidf_matrix.shape
The tfidf_matrix is the matrix containing each word and its TF-IDF score with regard to each book title. Also, stop words are simply words that add no significant value to our system, like ‘an’, ‘is’, ‘the’, and hence are ignored by the system. Now, we have a representation of every book in terms of the words of the titles. Next, we need to calculate the relevance or similarity of one book to another.
# Pairwise similarity between all title vectors. TF-IDF rows are
# L2-normalised, so the linear kernel (a plain dot product) equals cosine
# similarity here.
from sklearn.metrics.pairwise import linear_kernel
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_similarities.shape
cosine_similarities[0]  # similarities of the first book to every book
# For each book, cache its 99 most similar titles (by title-text cosine
# similarity), most similar first, excluding the book itself.
# Fix: the original loop iterated over an undefined DataFrame `ds` and read a
# non-existent 'id' column, raising NameError. Iterate over the similarity
# matrix rows instead and label entries with the `titles` Series, whose rows
# are aligned with cosine_similarities (both come from merged_book).
results = {}
for idx in range(cosine_similarities.shape[0]):
    # argsort ascending, then walk backwards -> indices of the 99 largest.
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
    similar_items = [(cosine_similarities[idx][i], titles.iloc[i]) for i in similar_indices]
    # Drop the first entry: a title is always most similar to itself.
    results[titles.iloc[idx]] = similar_items[1:]
# vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=0, stop_words='english')
# vectors = vectorizer.fit_transform(corpus)
# feature_names = vectorizer.get_feature_names()
# dense = vectors.todense()
# denselist = dense.tolist()
# df = pd.DataFrame(denselist, columns=feature_names)
# df